# imports
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import image as img
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
# Read the Bob Ross CSV (path is relative to the working directory) into `br`.
# Later cells read its EPISODE, TITLE and GUEST columns and treat every column
# from the third one onward as per-painting element data.
br = pd.read_csv('elements-by-episode.csv')
# Preview the first rows of the table
br.head()
# See which elements are the most common.
# The first two columns are skipped; each remaining column is summed to get
# the number of paintings in which that element appears.
elements = list(br.columns.values[2:])
element_counts = [sum(br[col]) for col in elements]
# Graph how often each element appears: one bar per element column,
# labelled with the column name (rotated so the labels stay legible).
fig = plt.figure(figsize=(15, 5))
y_pos = np.arange(len(br.columns.values[2:]))
ax = fig.add_subplot(111)
ax.set(
    xlabel='Element',
    ylabel='Number of appearances',
    title='Most Common Elements of a Bob Ross Painting',
)
ax.set_xticks(y_pos)
ax.set_xticklabels(elements, rotation=90)
ax.bar(y_pos, element_counts)
plt.show()
The five most common elements in a Bob Ross painting are Trees (background), Tree (foreground), Deciduous, Conifer, and Clouds. Use these to make the quintessential Bob Ross painting.
# Keep only the element columns (everything after the first two columns);
# this is the feature table used by the clustering cells.
data = br.loc[:, br.columns[2:]]
Initially, we had planned on performing k-means clustering on the data to sort the paintings into similar groups. Unfortunately, we found that this kind of clustering is not particularly well suited to binary data. This is because scikit-learn's implementation works best with Euclidean distance, whereas binary data works best with Hamming or cosine distance. We decided to perform a different kind of clustering (hierarchical clustering) and visualize it using a dendrogram, despite being unable to use cosine distance. This is an example of a "happy little accident." Although we were unable to cluster the data in the meaningful way that we had hoped, we learned quite a bit about choosing the correct metric for the form of the data.
# Create hierarchical clusters to sort the paintings into three groups.
# Ward linkage only works with Euclidean distance, which is also the
# estimator's default, so the distance is not passed explicitly: the old
# `affinity=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# and omitting it keeps this cell working on both old and new releases.
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
# Fit the model and show the cluster label assigned to each painting
cluster.fit_predict(data)
# Show the dendrogram that displays cluster choices (small version)
# https://www.displayr.com/what-is-dendrogram/
plt.figure(figsize=(20, 7))
plt.title("Bob Ross Painting Cluster Dendogram")
# Hand the titles to dendrogram() via `labels=` instead of calling
# plt.xticks() beforehand: dendrogram() installs its own ticks in leaf order,
# so ticks set earlier are discarded and would not line up with the leaves.
dend = shc.dendrogram(
    shc.linkage(data, method='ward'),
    labels=list(br['TITLE']),
)
# Show the dendrogram that displays cluster choices (large version).
# The figure is very tall so that every painting title gets its own row.
fig = plt.figure(figsize=(15, 150))
ax = plt.subplot(111, xlabel='x', ylabel='y', title='Bob Ross Cluster Dendrogram')
# Enlarge the title and axis labels
for item in (ax.title, ax.xaxis.label, ax.yaxis.label):
    item.set_fontsize(20)
# The leaf labels (painting titles) and their font size are supplied to
# dendrogram() directly; setting y ticks beforehand has no lasting effect
# because dendrogram() replaces them with its own leaf-ordered ticks.
dend = shc.dendrogram(
    shc.linkage(data, method='ward'),
    orientation='right',
    leaf_font_size=15,
    labels=list(br['TITLE']),
)
# Size the distance-axis tick labels after drawing, once the ticks exist
ax.tick_params(axis='x', labelsize=20)
This is our Bob Ross Gallery Generator. We created a function that takes a painting name and a desired number of paintings for the gallery as user input. It then displays the suggested gallery, built from the k nearest neighbors of the user-requested painting. We learned from the clustering adventure and this time used the Hamming distance metric for our KNN model to get more accurate galleries.
def createGallery(name, n):
    """Display a gallery of the ``n`` paintings most similar to ``name``.

    The painting is looked up by title (case-insensitively) in the
    module-level ``br`` DataFrame.  Its ``n`` nearest neighbours under the
    Hamming distance over the element columns (every column after the first
    two) are then found, and each neighbour's image is drawn as one row of a
    single figure.  A neighbour whose image file cannot be read is reported
    with a printed message and its row is left empty.

    Args:
        name: Title of the painting to build the gallery around, written
            without the first and last character that the stored ``TITLE``
            values carry (presumably quote characters).
        n: Number of paintings to show.

    Raises:
        ValueError: If no painting in ``br`` has the requested title.
    """
    name = name.upper()

    # Find the row of the desired painting.  Stored titles are compared with
    # their first and last character stripped.
    matches = [
        pos for pos, stored in enumerate(br['TITLE']) if stored[1:-1] == name
    ]
    if not matches:
        # Previously an unknown title silently produced a gallery for row 0.
        raise ValueError(f'No painting titled {name!r} was found')
    # If several rows share the title, use the last one.
    index = matches[-1]

    # Create the KNN model.
    # Use the Guest column as the target; only the neighbour lookup is used,
    # so the target never influences which paintings are returned.
    target = br['GUEST']
    data = br[br.columns.values[2:]]
    neigh = KNeighborsClassifier(n_neighbors=n, metric='hamming')
    neigh.fit(data, target)

    # Find the k nearest neighbours.  The query is a one-row slice of the
    # training table so it has exactly the same columns as the fitted data.
    neighbors = neigh.kneighbors(
        data.iloc[[index]], n_neighbors=n, return_distance=False
    )[0]

    # Build the punctuation-stripping table once rather than per painting.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # One shared figure with one row per neighbour.
    fig = plt.figure(figsize=(10, n * 5))
    for row, i in enumerate(neighbors, start=1):
        # Look up the episode and title
        et = br.iloc[i, 0:2]
        episode = et["EPISODE"]
        title = et["TITLE"]

        # Image files are named after the lower-cased title with punctuation
        # and spaces removed.
        img_title = 'Images/' + title[1:-1].lower().translate(strip_punctuation) + '.png'
        image_name = img_title.replace(' ', '')

        # Only the file read is guarded, so plotting errors are not
        # misreported as a missing image.
        try:
            image = img.imread(image_name)
        except OSError:
            # If the painting isn't saved, print an error message
            print('Sorry! I couldn\'t find ' + title + ' :(')
            continue

        # Show the image without axis ticks
        ax = fig.add_subplot(n, 1, row)
        ax.set_title(episode + ' ' + title)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.imshow(image)
# Try createGallery with a winter-themed painting (gallery of 5)
createGallery('WINTER SAWSCAPE', 5)
# Try createGallery with a beach-themed painting (gallery of 5)
createGallery('SEASCAPE', 5)
# Our final gallery: 12 paintings built around 'A WALK IN THE WOODS'
createGallery('A WALK IN THE WOODS', 12)